R/conservation-districts/other peoples scripts/0_scripts/5_padus_ar_comparisons.R

Defines functions installpkg

# Original Date: 2022-10-14
# Last updated : 2022-10-30 by MB
# M Browning
# Description: Compare PAD-US-AR park cover with sociodemographics, NDVI, and canopy cover
# Concept: Calculate descriptive statistics for PAD-US-AR and other relevant datasets
#          Calculate bivariate correlations between PAD-US-AR and other greenspace metrics
#          Calculate multivariate associations between PAD-US-AR and sociodemographics

# ------------------------------------------------------------------------%
# Set up  -----------------------------------------------------------------
# ------------------------------------------------------------------------%

# function for installing needed packages
installpkg <- function(x) {
  if (x %in% rownames(installed.packages()) == FALSE) {
    if (x %in% rownames(available.packages()) == FALSE) {
      paste(x, "is not a valid package - please check again...")
    } else {
      install.packages(x)
    }
    
  } else {
    paste(x, "package already installed...")
  }
}

# load and install necessary packages
required_packages_1  <-
  c(
    "tidyverse",
    "readr",
    "sf",
    "raster",
    "exactextractr",
    "tictoc",
    "beepr",
    "here",
    "dplyr",
    "stringr",
    "tigris",
    "sjmisc",
    "ggeasy",
    "gridExtra",
    "sjPlot",
    "lme4",
    "correlation"
  )
lapply(required_packages_1, installpkg)
invisible(lapply(required_packages_1, library, character.only = TRUE))

options(tigris_use_cache = TRUE) # so that you only download once

# !! if different values result check package versions !!
# Run with tidyverse 1.3.2, readr 2.1.1, sf 1.0-8, raster 3.5-9,
#          exactextractr 0.7.2, tictoc 1.1, beepr 1.3, here 1.0.1,
#          dplyr 1.0.7, stringr 1.4.0, tigris 1.6.1, sjmisc 2.8,9
#          ggeasy 0.1.3, gridExtra 2.3, sjPlot 2.8.10, lme4 1.1-27.1,
#          correlation 0.7.1

# ------------------------------------------------------------------------%
# census regions ----------------------------------------------------------
# ------------------------------------------------------------------------%

# source: https://data2.nhgis.org/
# 2019 Census Regions - downloaded 07 July 2022
regions <- st_read("../1_source_data/census_regions/regions.shp")

# select states in CONUS, no AK or HI, but add DC
sts <- c(state.abb[-c(2, 11)], "DC")
conus <-
  states(year = 2019) %>% filter(STUSPS %in% sts) %>% st_union() %>% st_sf()

# set CRS for CONUS
conus <- st_transform(conus, st_crs(regions))

NE.name <- c(
  "Connecticut",
  "Maine",
  "Massachusetts",
  "New Hampshire",
  "Rhode Island",
  "Vermont",
  "New Jersey",
  "New York",
  "Pennsylvania"
)
NE.abrv <- c("CT", "ME", "MA", "NH", "RI", "VT", "NJ", "NY", "PA")
NE.ref <- c(NE.name, NE.abrv)

MW.name <- c(
  "Indiana",
  "Illinois",
  "Michigan",
  "Ohio",
  "Wisconsin",
  "Iowa",
  "Kansas",
  "Minnesota",
  "Missouri",
  "Nebraska",
  "North Dakota",
  "South Dakota"
)
MW.abrv <- c("IN",
             "IL",
             "MI",
             "OH",
             "WI",
             "IA",
             "KS",
             "MN",
             "MO",
             "NE",
             "ND",
             "SD")
MW.ref <- c(MW.name, MW.abrv)

S.name <- c(
  "Delaware",
  "District of Columbia",
  "Florida",
  "Georgia",
  "Maryland",
  "North Carolina",
  "South Carolina",
  "Virginia",
  "West Virginia",
  "Alabama",
  "Kentucky",
  "Mississippi",
  "Tennessee",
  "Arkansas",
  "Louisiana",
  "Oklahoma",
  "Texas"
)
S.abrv <- c(
  "DE",
  "DC",
  "FL",
  "GA",
  "MD",
  "NC",
  "SC",
  "VA",
  "WV",
  "AL",
  "KY",
  "MS",
  "TN",
  "AR",
  "LA",
  "OK",
  "TX"
)
S.ref <- c(S.name, S.abrv)

W.name <- c(
  "Arizona",
  "Colorado",
  "Idaho",
  "New Mexico",
  "Montana",
  "Utah",
  "Nevada",
  "Wyoming",
  "Alaska",
  "California",
  "Hawaii",
  "Oregon",
  "Washington"
)
W.abrv <- c("AZ",
            "CO",
            "ID",
            "NM",
            "MT",
            "UT",
            "NV",
            "WY",
            "AK",
            "CA",
            "HI",
            "OR",
            "WA")
W.ref <- c(W.name, W.abrv)

region.list <- list(
  Northeast = NE.ref,
  Midwest = MW.ref,
  South = S.ref,
  West = W.ref
)

# --------------------------------------------------------------%
# counties -----------------------------------------------------
# --------------------------------------------------------------%

# counties
# source: https://data2.nhgis.org/
# 2019 Census Regions - downloaded 07 July 2022
county <- st_read("../1_source_data/counties/USA_Counties.shp")
county_witharea <-
  sts %>% map_df( ~ counties(state = .x, year = 2019))
county <- st_transform(county, st_crs(regions))
st_crs(county) == st_crs(regions) # should be true

# ------------------------------------------------------------%
# tracts -----------------------------------------------------
# ------------------------------------------------------------%

# tracts
tract2 <- st_read("../1_source_data/tracts/US_tract_2019.shp")
tract_witharea <-
  unique(county$STATEFP) %>% map_df( ~ tracts(state = .x, year = 2019))
tract2 <- st_transform(tract2, st_crs(regions))
st_crs(tract2) == st_crs(regions) # should be true
tract$LandAreakm2 <- tract$ALAND / 1000000

# ----------------------------------------------------------------------%
# park cover data -------------------------------------------------------
# ----------------------------------------------------------------------%

parkc <- read_csv("../3_data/padus_ar_cov_cnty.txt")
parkt <- read_csv("../3_data/padus_ar_cov_btrct.txt")

# ----------------------------------------------------------------------%
# canopy cover ----------------------------------------------------------
# ----------------------------------------------------------------------%

# source: https://www.mrlc.gov/data/nlcd-2016-usfs-tree-canopy-cover-conus
# 2019 release of 2016 tree canopy cover data from MLRC - downloaded 1 Oct 2022

# load data
tree_ras <-
  raster('../1_source_data/NLCD/nlcd_2016_treecanopy_2019_08_31.img')

# match CRS
#tree_ras <- st_transform(tree_ras, crs(regions))
st_crs(county) == st_crs(tree_ras) #should be true
st_crs(tract) == st_crs(tree_ras) #should be true

# focal stats
county$PcCanopy <- exact_extract(tree_ras, county, 'mean')
tract$PcCanopy <- exact_extract(tree_ras, tract, 'mean')
tract$PcCanopy <- tract$PcCanopy / 100

# ----------------------------------------------------------------------%
# annual NDVI -----------------------------------------------------------
# ----------------------------------------------------------------------%

# source: Google Earth Engine
# MODIS 250x250m2 NDVI annual composite from Jan 1 2015 to Dec 31 2020

# load data
NDVI_ras <- raster('../1_source_data/GEE/NDVI_15to20.tif')

# match CRS
st_crs(county) == st_crs(NDVI_ras) #should be true
st_crs(tract) == st_crs(NDVI_ras) #should be true
# ! if these are not true ! use command:
# NDVI_ras <- projectRaster(NDVI_ras, crs = crs(regions))

# focal stats
county$NDVI_annual <- exact_extract(NDVI_ras, county, 'mean')
county$NDVI_annual <- county$NDVI_annual / 10000
tract$NDVI_annual <- exact_extract(NDVI_ras, tract, 'mean')
tract$NDVI_annual <- tract$NDVI_annual / 10000

# ----------------------------------------------------------------------%
# summer high NDVI ------------------------------------------------------
# ----------------------------------------------------------------------%

# source: Google Earth Engine
# MODIS 250x250m2 NDVI annual composite from Jan 1 2015 to Dec 31 2020
# zonal statistics run on GEE
# code at https://code.earthengine.google.com/3984fd87248d4d24c76744fa32e2abeb
# provided CSV files with NDVI mean for counties and tracts

# ---- counties ----

# 2015
dc_gee15 <-
  read_csv("../1_source_data/GEE/NDVI_2015_summertime_counties.csv")
names(dc_gee15) <- paste0(names(dc_gee15), "_15")
dc_gee15$ndvi_mean_15 <- dc_gee15$ndvi_mean_15 / 10000
dc_gee15$GEOID_15 <- as.numeric(dc_gee15$GEOID_15)

# 2016
dc_gee16 <-
  read_csv("../1_source_data/GEE/NDVI_2016_summertime_counties.csv")
names(dc_gee16) <- paste0(names(dc_gee16), "_16")
dc_gee16$ndvi_mean_16 <- dc_gee16$ndvi_mean_16 / 10000
dc_gee16$GEOID_16 <- as.numeric(dc_gee16$GEOID_16)

# 2017
dc_gee17 <-
  read_csv("../1_source_data/GEE/NDVI_2017_summertime_counties.csv")
names(dc_gee17) <- paste0(names(dc_gee17), "_17")
dc_gee17$ndvi_mean_17 <- dc_gee17$ndvi_mean_17 / 10000
dc_gee17$GEOID_17 <- as.numeric(dc_gee17$GEOID_17)

# 2018
dc_gee18 <-
  read_csv("../1_source_data/GEE/NDVI_2018_summertime_counties.csv")
names(dc_gee18) <- paste0(names(dc_gee18), "_18")
dc_gee18$ndvi_mean_18 <- dc_gee18$ndvi_mean_18 / 10000
dc_gee18$GEOID_18 <- as.numeric(dc_gee18$GEOID_18)

# 2019
dc_gee19 <-
  read_csv("../1_source_data/GEE/NDVI_2019_summertime_counties.csv")
names(dc_gee19) <- paste0(names(dc_gee19), "_19")
dc_gee19$ndvi_mean_19 <- dc_gee19$ndvi_mean_19 / 10000
dc_gee19$GEOID_19 <- as.numeric(dc_gee19$GEOID_19)

# 2020
dc_gee20 <-
  read_csv("../1_source_data/GEE/NDVI_2020_summertime_counties.csv")
names(dc_gee20) <- paste0(names(dc_gee20), "_20")
dc_gee20$ndvi_mean_20 <- dc_gee20$ndvi_mean_20 / 10000
dc_gee20$GEOID_20 <- as.numeric(dc_gee20$GEOID_20)

# combine yrs
dc_gee15_16 <-
  sp::merge(dc_gee15, dc_gee16, by.x = 'GEOID_15', by.y = 'GEOID_16')
dc_gee15_16_17 <-
  sp::merge(dc_gee15_16, dc_gee17, by.x = 'GEOID_15', by.y = 'GEOID_17')
dc_gee15_16_17_18 <-
  sp::merge(dc_gee15_16_17, dc_gee18, by.x = 'GEOID_15', by.y = 'GEOID_18')
dc_gee15_16_17_18_19 <-
  sp::merge(dc_gee15_16_17_18, dc_gee19, by.x = 'GEOID_15', by.y = 'GEOID_19')
dc_gee15_16_17_18_19_20 <-
  sp::merge(dc_gee15_16_17_18_19,
            dc_gee20,
            by.x = 'GEOID_15',
            by.y = 'GEOID_20')

# change vars
dc_gee15_16_17_18_19_20$FIPS <-
  sprintf("%05d", dc_gee15_16_17_18_19_20$GEOID_15)
dc_gee15_16_17_18_19_20 <-
  dc_gee15_16_17_18_19_20 %>% dplyr::select(matches("FIPS|ndvi_mean"))
dc_gee15_16_17_18_19_20$NDVI_Sum <-
  (
    dc_gee15_16_17_18_19_20$ndvi_mean_15 + dc_gee15_16_17_18_19_20$ndvi_mean_16 +
      dc_gee15_16_17_18_19_20$ndvi_mean_17 + dc_gee15_16_17_18_19_20$ndvi_mean_18 +
      dc_gee15_16_17_18_19_20$ndvi_mean_19 + dc_gee15_16_17_18_19_20$ndvi_mean_20
  ) / 6

# merge
county <-
  sp::merge(county,
            dc_gee15_16_17_18_19_20,
            by.x = 'FIPS',
            by.y = 'FIPS')

# ---- tracts ----

# 2015
dt_gee15 <-
  read_csv("../1_source_data/GEE/NDVI_2015_summertime_tracts.csv")
names(dt_gee15) <- paste0(names(dt_gee15), "_15")
dt_gee15$ndvi_mean_15 <- dt_gee15$ndvi_mean_15 / 10000
dt_gee15$geoid10_15n <- as.numeric(dt_gee15$geoid10_15)

# 2016
dt_gee16 <-
  read_csv("../1_source_data/GEE/NDVI_2016_summertime_tracts.csv")
names(dt_gee16) <- paste0(names(dt_gee16), "_16")
dt_gee16$ndvi_mean_16 <- dt_gee16$ndvi_mean_16 / 10000
dt_gee16$geoid10_16n <- as.numeric(dt_gee16$geoid10_16)

# 2017
dt_gee17 <-
  read_csv("../1_source_data/GEE/NDVI_2017_summertime_tracts.csv")
names(dt_gee17) <- paste0(names(dt_gee17), "_17")
dt_gee17$ndvi_mean_17 <- dt_gee17$ndvi_mean_17 / 10000
dt_gee17$geoid10_17n <- as.numeric(dt_gee17$geoid10_17)

# 2018
dt_gee18 <-
  read_csv("../1_source_data/GEE/NDVI_2018_summertime_tracts.csv")
names(dt_gee18) <- paste0(names(dt_gee18), "_18")
dt_gee18$ndvi_mean_18 <- dt_gee18$ndvi_mean_18 / 10000
dt_gee18$geoid10_18n <- as.numeric(dt_gee18$geoid10_18)

# 2019
dt_gee19 <-
  read_csv("../1_source_data/GEE/NDVI_2019_summertime_tracts.csv")
names(dt_gee19) <- paste0(names(dt_gee19), "_19")
dt_gee19$ndvi_mean_19 <- dt_gee19$ndvi_mean_19 / 10000
dt_gee19$geoid10_19n <- as.numeric(dt_gee19$geoid10_19)

# 2020
dt_gee20 <-
  read_csv("../1_source_data/GEE/NDVI_2020_summertime_tracts.csv")
names(dt_gee20) <- paste0(names(dt_gee20), "_20")
dt_gee20$ndvi_mean_20 <- dt_gee20$ndvi_mean_20 / 10000
dt_gee20$geoid10_20n <- as.numeric(dt_gee20$geoid10_20)

# combine
dt_gee15_16 <-
  sp::merge(dt_gee15, dt_gee16, by.x = 'geoid10_15', by.y = 'geoid10_16')
dt_gee15_16_17 <-
  sp::merge(dt_gee15_16, dt_gee17, by.x = 'geoid10_15', by.y = 'geoid10_17')
dt_gee15_16_17_18 <-
  sp::merge(dt_gee15_16_17, dt_gee18, by.x = 'geoid10_15', by.y = 'geoid10_18')
dt_gee15_16_17_18_19 <-
  sp::merge(dt_gee15_16_17_18, dt_gee19, by.x = 'geoid10_15', by.y = 'geoid10_19')
dt_gee15_16_17_18_19_20 <-
  sp::merge(dt_gee15_16_17_18_19,
            dt_gee20,
            by.x = 'geoid10_15',
            by.y = 'geoid10_20')

# change vars
dt_gee15_16_17_18_19_20$FIPS <- dt_gee15_16_17_18_19_20$geoid10_15
dt_gee15_16_17_18_19_20 <-
  dt_gee15_16_17_18_19_20 %>% dplyr::select(matches("FIPS|ndvi_mean"))
dt_gee15_16_17_18_19_20$NDVI_Sum <-
  (
    dt_gee15_16_17_18_19_20$ndvi_mean_15 + dt_gee15_16_17_18_19_20$ndvi_mean_16 +
      dt_gee15_16_17_18_19_20$ndvi_mean_17 + dt_gee15_16_17_18_19_20$ndvi_mean_18 +
      dt_gee15_16_17_18_19_20$ndvi_mean_19 + dt_gee15_16_17_18_19_20$ndvi_mean_20
  ) / 6

# merge
tract <-
  sp::merge(tract,
            dt_gee15_16_17_18_19_20,
            by.x = 'GEOID',
            by.y = 'FIPS')

# ----------------------------------------------------------------------%
# sociodemographic data for counties ------------------------------------
# ----------------------------------------------------------------------%

# file 1
county_d1 <-
  read_csv("../1_source_data/NHGIS/nhgis0050_ds244_20195_county.csv") %>%
  rename(
    NHGIS_State = STUSAB,
    Tot = ALT0E001,
    Female = ALT0E026,
    NHWhite = ALUKE003,
    NHBlack = ALUKE004,
    NHAsian = ALUKE006,
    Hisp = ALUKE012,
    EducTot = ALWGE001,
    EducHS_1 = ALWGE017,
    EducHS_2 = ALWGE018,
    EducHS_3 = ALWGE019,
    EducHS_4 = ALWGE020,
    EducHS_5 = ALWGE021,
    EducColl_1 = ALWGE022,
    EducColl_2 = ALWGE023,
    EducColl_3 = ALWGE024,
    EducColl_4 = ALWGE025,
    PovTot = ALWYE001,
    Pov = ALWYE002,
    MedHHInc = ALW1E001,
    UnempTot = ALY3E001,
    Unemp = ALY3E007,
    OldAgeM_1 = ALT0E020,
    OldAgeM_2 = ALT0E021,
    OldAgeM_3 = ALT0E022,
    OldAgeM_4 = ALT0E023,
    OldAgeM_5 = ALT0E024,
    OldAgeM_6 = ALT0E025,
    OldAgeF_1 = ALT0E044,
    OldAgeF_2 = ALT0E045,
    OldAgeF_3 = ALT0E046,
    OldAgeF_4 = ALT0E047,
    OldAgeF_5 = ALT0E048,
    OldAgeF_6 = ALT0E049,
    MedHHValue = AL1HE001
  ) %>%
  dplyr::select(
    matches(
      "NHGIS|GEOID|ISJOIN|Tot|Female|NHWhite|NHBlack|NHAsian|Hisp|Educ|Pov|Med|Unemp|OldAge"
    )
  )

county_d1 <- county_d1[-1,]

numvars <-
  c(
    'Tot',
    'Female' ,
    'NHWhite',
    'NHBlack',
    'NHAsian',
    'Hisp',
    'EducTot',
    'EducHS_1',
    'EducHS_2',
    'EducHS_3',
    'EducHS_4',
    'EducHS_5',
    'EducColl_1',
    'EducColl_2',
    'EducColl_3',
    'EducColl_4',
    'PovTot',
    'Pov',
    'MedHHInc',
    'UnempTot',
    'Unemp',
    'OldAgeM_1',
    'OldAgeM_2',
    'OldAgeM_3',
    'OldAgeM_4',
    'OldAgeM_5',
    'OldAgeM_6',
    'OldAgeF_1',
    'OldAgeF_2',
    'OldAgeF_3',
    'OldAgeF_4',
    'OldAgeF_5',
    'OldAgeF_6',
    'MedHHValue'
  )
county_d1[, numvars] <- lapply(county_d1[, numvars], as.numeric)

county_d1$PercFemale <- county_d1$Female / county_d1$Tot
county_d1$PercNHWhite <- county_d1$NHWhite / county_d1$Tot
county_d1$PercNHWhite <- county_d1$NHWhite / county_d1$Tot
county_d1$PercNHBlack <- county_d1$NHBlack / county_d1$Tot
county_d1$PercNHAsian <- county_d1$NHAsian / county_d1$Tot
county_d1$PercHisp <- county_d1$Hisp / county_d1$Tot
county_d1$PercEducHS <-
  (
    county_d1$EducHS_1 + county_d1$EducHS_2 + county_d1$EducHS_3 + county_d1$EducHS_4 +
      county_d1$EducHS_5 + county_d1$EducColl_1 + county_d1$EducColl_2 + county_d1$EducColl_3 +
      county_d1$EducColl_4
  ) / county_d1$EducTot
county_d1$PercEducColl <-
  (
    county_d1$EducColl_1 + county_d1$EducColl_2 + county_d1$EducColl_3 + county_d1$EducColl_4
  ) / county_d1$EducTot
county_d1$PercPov <- county_d1$Pov / county_d1$PovTot
county_d1$PercUnemp <- county_d1$Unemp / county_d1$UnempTot
county_d1$Perc65Up <-
  (
    county_d1$OldAgeM_1 + county_d1$OldAgeM_2 + county_d1$OldAgeM_3 + county_d1$OldAgeM_4 +
      county_d1$OldAgeM_5 + county_d1$OldAgeM_6 + county_d1$OldAgeF_1 + county_d1$OldAgeF_2 +
      county_d1$OldAgeF_3 + county_d1$OldAgeF_4 + county_d1$OldAgeF_5 + county_d1$OldAgeF_6
  ) / county_d1$Tot

county_d1 <-
  county_d1 %>% dplyr::select(matches("Tot|NHGIS|GISJOIN|GEOID|Perc|Med"))
county_d1 <- county_d1 %>% filter(!is.na(county_d1$MedHHInc))

county_d1$FIPS <- str_sub(county_d1$GEOID, -5, -1)

# file 2
county_d2 <-
  read_csv("../1_source_data/NHGIS/nhgis0050_ds245_20195_county.csv") %>%
  rename(
    NHGIS_State = STUSAB,
    Gini = AMEME001,
    EmpTot = AMHRE001,
    EmpNR = AMHRE002
  ) %>%
  dplyr::select(matches("NHGIS_State|GEOID|GISJOIN|Gini|EmpTot|EmpNR"))

county_d2 <- county_d2[-1,]

numvars <- c('Gini' , 'EmpTot', 'EmpNR')
county_d2[, numvars] <- lapply(county_d2[, numvars], as.numeric)

county_d2$PercEmpNR <- county_d2$EmpNR / county_d2$EmpTot

county_d2 <-
  county_d2 %>% dplyr::select(matches("GISJOIN|NHGIS|GEOID|Gini|Perc"))
county_d2 <- county_d2 %>% filter(!is.na(county_d2$PercEmpNR))

county_d2$FIPS <- str_sub(county_d2$GEOID, -5, -1)

# ----------------------------------------------------------------------%
# sociodemographic data for tracts --------------------------------------
# ----------------------------------------------------------------------%

# file 1
tract_d1 <-
  read_csv("../1_source_data/NHGIS/nhgis0049_ds244_20195_tract.csv") %>%
  rename(
    NHGIS_State = STUSAB,
    TotPop = ALT0E001,
    Female = ALT0E026,
    NHWhite = ALUKE003,
    NHBlack = ALUKE004,
    NHAsian = ALUKE006,
    Hisp = ALUKE012,
    EducTot = ALWGE001,
    EducHS_1 = ALWGE017,
    EducHS_2 = ALWGE018,
    EducHS_3 = ALWGE019,
    EducHS_4 = ALWGE020,
    EducHS_5 = ALWGE021,
    EducColl_1 = ALWGE022,
    EducColl_2 = ALWGE023,
    EducColl_3 = ALWGE024,
    EducColl_4 = ALWGE025,
    PovTot = ALWYE001,
    Pov = ALWYE002,
    MedHHInc = ALW1E001,
    UnempTot = ALY3E001,
    Unemp = ALY3E007,
    OldAgeM_1 = ALT0E020,
    OldAgeM_2 = ALT0E021,
    OldAgeM_3 = ALT0E022,
    OldAgeM_4 = ALT0E023,
    OldAgeM_5 = ALT0E024,
    OldAgeM_6 = ALT0E025,
    OldAgeF_1 = ALT0E044,
    OldAgeF_2 = ALT0E045,
    OldAgeF_3 = ALT0E046,
    OldAgeF_4 = ALT0E047,
    OldAgeF_5 = ALT0E048,
    OldAgeF_6 = ALT0E049
  ) %>%
  dplyr::select(
    matches(
      "NHGIS|GISJOIN|Tot|Female|NHWhite|NHBlack|NHAsian|Hisp|Educ|Pov|Med|Unemp|OldAge"
    )
  )

tract_d1 <- tract_d1[-1,]
numvars <-
  c(
    'TotPop',
    'Female' ,
    'NHWhite',
    'NHBlack',
    'NHAsian',
    'Hisp',
    'EducTot',
    'EducHS_1',
    'EducHS_2',
    'EducHS_3',
    'EducHS_4',
    'EducHS_5',
    'EducColl_1',
    'EducColl_2',
    'EducColl_3',
    'EducColl_4',
    'PovTot',
    'Pov',
    'MedHHInc',
    'UnempTot',
    'Unemp',
    'OldAgeM_1',
    'OldAgeM_2',
    'OldAgeM_3',
    'OldAgeM_4',
    'OldAgeM_5',
    'OldAgeM_6',
    'OldAgeF_1',
    'OldAgeF_2',
    'OldAgeF_3',
    'OldAgeF_4',
    'OldAgeF_5',
    'OldAgeF_6'
  )
tract_d1[, numvars] <- lapply(tract_d1[, numvars], as.numeric)
tract_d1$PercFemale <- tract_d1$Female / tract_d1$TotPop
tract_d1$PercNHWhite <- tract_d1$NHWhite / tract_d1$TotPop
tract_d1$PercNHWhite <- tract_d1$NHWhite / tract_d1$TotPop
tract_d1$PercNHBlack <- tract_d1$NHBlack / tract_d1$TotPop
tract_d1$PercNHAsian <- tract_d1$NHAsian / tract_d1$TotPop
tract_d1$PercHisp <- tract_d1$Hisp / tract_d1$TotPop
tract_d1$PercEducHS <-
  (
    tract_d1$EducHS_1 + tract_d1$EducHS_2 + tract_d1$EducHS_3 + tract_d1$EducHS_4 +
      tract_d1$EducHS_5 + tract_d1$EducColl_1 + tract_d1$EducColl_2 + tract_d1$EducColl_3 +
      tract_d1$EducColl_4
  ) / tract_d1$EducTot
tract_d1$PercEducColl <-
  (
    tract_d1$EducColl_1 + tract_d1$EducColl_2 + tract_d1$EducColl_3 + tract_d1$EducColl_4
  ) / tract_d1$EducTot
tract_d1$PercPov <- tract_d1$Pov / tract_d1$PovTot
tract_d1$PercUnemp <- tract_d1$Unemp / tract_d1$UnempTot
tract_d1$Perc65Up <-
  (
    tract_d1$OldAgeM_1 + tract_d1$OldAgeM_2 + tract_d1$OldAgeM_3 + tract_d1$OldAgeM_4 +
      tract_d1$OldAgeM_5 + tract_d1$OldAgeM_6 + tract_d1$OldAgeF_1 + tract_d1$OldAgeF_2 +
      tract_d1$OldAgeF_3 + tract_d1$OldAgeF_4 + tract_d1$OldAgeF_5 + tract_d1$OldAgeF_6
  ) / tract_d1$TotPop

tract_d1 <-
  tract_d1 %>% dplyr::select(matches("NHGIS|GISJOIN|Perc|MedHHInc|TotPop"))
tract_d1 <- tract_d1 %>% filter(!is.na(tract_d1$MedHHInc))

# file 2
tract_d2 <-
  read_csv("../1_source_data/NHGIS/nhgis0049_ds245_20195_tract.csv") %>%
  rename(
    NHGIS_State = STUSAB,
    Gini = AMEME001,
    EmpTot = AMHRE001,
    EmpNR = AMHRE002
  ) %>%
  dplyr::select(matches("NHGIS_State|GISJOIN|Gini|EmpTot|EmpNR"))

tract_d2 <- tract_d2[-1,]

numvars <- c('Gini' , 'EmpTot', 'EmpNR')
tract_d2[, numvars] <- lapply(tract_d2[, numvars], as.numeric)

tract_d2$PercEmpNR <- tract_d2$EmpNR / tract_d2$EmpTot

tract_d2 <-
  tract_d2 %>% dplyr::select(matches("GISJOIN|NHGIS|Gini|Perc"))

tract_d2 <- tract_d2 %>% filter(!is.na(tract_d2$PercEmpNR))

# file 3
tract_d3 <-
  read_csv("../1_source_data/NHGIS/nhgis0051_ds244_20195_tract.csv") %>%
  rename(NHGIS_State = STUSAB,
         MedHHValue = AL1HE001) %>%
  dplyr::select(matches("GISJOIN|MedHHValue|GEOID"))

tract_d3$GEOID_11 <- str_sub(tract_d3$GEOID, -11, -1)

tract_d3 <- tract_d3[-1,]

numvars <- c('MedHHValue')
tract_d3[, numvars] <- lapply(tract_d3[, numvars], as.numeric)

tract_d3 <- tract_d3 %>% filter(!is.na(tract_d3$MedHHValue))

# ----------------------------------------------------------------------%
# merge data ------------------------------------------------------------
# ----------------------------------------------------------------------%

# ---- counties ----

# first transform shapefile with tree and NDVI values into dataframe
county_df <- as.data.frame(county)

# merges
county_d3 <-
  sp::merge(county_d1, county_d2, by.x = 'GISJOIN', by.y = 'GISJOIN') # add demographics
county_d4 <-
  sp::merge(county_d3,
            county_df,
            by.x = 'FIPS.y',
            by.y = 'FIPS',
            all.y = F) # add shapefile dataframe
county_d6 <-
  sp::merge(county_d4, parkc, by.x = "FIPS.y", by.y = "GEOID") # add PAD-US-AR cover

# rename long var names that will be truncated by GIS
county_d7 <-
  county_d6 %>% dplyr::select (-c(EducTot, PovTot, UnempTot)) %>%
  rename(
    TotPop = Tot,
    MedHInc = MedHHInc,
    MedHVal = MedHHValue,
    PcFem = PercFemale,
    PcWhite = PercNHWhite,
    PcBlack = PercNHBlack,
    PcAsian = PercNHAsian,
    PcHisp = PercHisp,
    PcHighSc = PercEducHS,
    PcColl = PercEducColl,
    PcPov = PercPov,
    PcUnemp = PercUnemp,
    Pc65Up = Perc65Up,
    PcEmpNR = PercEmpNR,
    State = NHGIS_State.x,
    FIPS = FIPS.y,
    PcPark = pc_park,
    NDVI_S15 = ndvi_mean_15,
    NDVI_S16 = ndvi_mean_16,
    NDVI_S17 = ndvi_mean_17,
    NDVI_S18 = ndvi_mean_18,
    NDVI_S19 = ndvi_mean_19,
    NDVI_S20 = ndvi_mean_20,
    NDVI_Sum = NDVI_Sum,
    NDVI_Ann = NDVI_annual
  )

# calculate pop density
county_d7$PopDens <- county_d7$POP_SQMI * .3861

# add region
county_d8 <- county_d7
county_d8$Region <- sapply(county_d8$State,
                           function(x)
                             names(region.list)[grep(x, region.list)])
table(county_d8$Region, county_d8$State)

# reorder variables
county_d9 <-
  county_d8[, c(
    "FIPS",
    "State",
    "Region",
    "TotPop",
    "PopDens",
    "PcFem",
    "PcWhite",
    "PcBlack",
    "PcAsian",
    "PcHisp",
    "Pc65Up",
    "MedHInc",
    "MedHVal",
    "PcPov",
    "Gini",
    "PcUnemp",
    "PcEmpNR",
    "PcHighSc",
    "PcColl",
    "NDVI_Sum",
    "NDVI_Ann",
    "PcCanopy",
    "PcPark"
  )]

# make park and canopy cover on same scale as NDVI
county_d9$PcPark <- county_d9$PcPark / 100
county_d9$PcCanopy <- county_d9$PcCanopy / 100

# name final dataset
county_F <- county_d9

# separate into regions
county_NE <- county_F[county_F$Region == "Northeast",]
county_MW <- county_F[county_F$Region == "Midwest",]
county_SO <- county_F[county_F$Region == "South",]
county_WT <- county_F[county_F$Region == "West",]
county_NEMW <- rbind(county_NE, county_MW)

# can add back to spatial data for visualizations
county_complete <-
  sp::merge(county, county_F, by.x = 'FIPS', by.y = 'FIPS')
write_sf(county_complete, "../3_data/CountyValues.shp")
county_complete_NE <-
  county_complete[county_complete$Region == "Northeast",]
write_sf(county_complete_NE, "../3_data/CountyValuesNE.shp")
county_complete_MW <-
  county_complete[county_complete$Region == "Midwest",]
write_sf(county_complete_MW, "../3_data/CountyValuesMW.shp")
county_complete_SO <-
  county_complete[county_complete$Region == "South",]
write_sf(county_complete_SO, "../3_data/CountyValuesSO.shp")
county_complete_WT <-
  county_complete[county_complete$Region == "West",]
write_sf(county_complete_WT, "../3_data/CountyValuesWT.shp")

# ---- tracts ----

# first transform shapefile with tree and NDVI values into dataframe
tract_df <- as.data.frame(tract)

# merges
tract_d4 <-
  sp::merge(tract_d1, tract_d2, by.x = 'GISJOIN', by.y = 'GISJOIN') # add demographics
tract_d5 <-
  sp::merge(tract_d3, tract_d4, by.x = 'GISJOIN', by.y = 'GISJOIN') # add more demographics
tract_d6 <-
  sp::merge(tract_d5, tract, by.x = 'GEOID_11', by.y = 'GEOID') # add tree and NDVI values
tract_d7 <-
  sp::merge(tract_d6, parkt, by.x = 'GEOID_11', by.y = "GEOID") # add PAD-US-AR cover

# rename long var names that will be truncated by GIS
tract_d8 <-
  tract_d7 %>%
  rename(
    GISJOIN = GISJOIN.x,
    MedHInc = MedHHInc,
    MedHVal = MedHHValue,
    PcFem = PercFemale,
    PcWhite = PercNHWhite,
    PcBlack = PercNHBlack,
    PcAsian = PercNHAsian,
    PcHisp = PercHisp,
    PcHighSc = PercEducHS,
    PcColl = PercEducColl,
    PcPov = PercPov,
    PcUnemp = PercUnemp,
    Pc65Up = Perc65Up,
    PcEmpNR = PercEmpNR,
    State = NHGIS_State.x,
    PcPark = pc_park,
    NDVI_S15 = ndvi_mean_15,
    NDVI_S16 = ndvi_mean_16,
    NDVI_S17 = ndvi_mean_17,
    NDVI_S18 = ndvi_mean_18,
    NDVI_S19 = ndvi_mean_19,
    NDVI_S20 = ndvi_mean_20,
    NDVI_Sum = NDVI_Sum,
    NDVI_Ann = NDVI_annual
  )

# calculate pop density
tract_d8$PopDens <- tract_d8$TotPop / tract_d8$LandAreakm2

# add region
tract_d8$Region <- sapply(tract_d8$State,
                          function(x)
                            names(region.list)[grep(x, region.list)])
table(tract_d8$Region, tract_d8$State)

# reorder variables
tract_d9 <-
  tract_d8[, c(
    "GEOID",
    "GEOID_11",
    "GISJOIN",
    "State",
    "Region",
    "TotPop",
    "PopDens",
    "PcFem",
    "PcWhite",
    "PcBlack",
    "PcAsian",
    "PcHisp",
    "Pc65Up",
    "MedHInc",
    "MedHVal",
    "PcPov",
    "Gini",
    "PcUnemp",
    "PcEmpNR",
    "PcHighSc",
    "PcColl",
    "NDVI_Sum",
    "NDVI_Ann",
    "PcCanopy",
    "PcPark"
  )]

# make park cover on same scale as NDVI and canopy cover
tract_d9$PcPark <- tract_d9$PcPark / 100

# name final dataset
tract_F <- tract_d9

# separate into regions
tract_NE <- tract_F[tract_F$Region == "Northeast",]
tract_MW <- tract_F[tract_F$Region == "Midwest",]
tract_SO <- tract_F[tract_F$Region == "South",]
tract_WT <- tract_F[tract_F$Region == "West",]

# can add back to spatial data for visualizations
tract_complete <-
  sp::merge(tract, tract_F, by.x = 'GEOID', by.y = 'GEOID_11')
write_sf(tract_complete, "../3_data/TractValues.shp")
tract_complete_NE <-
  tract_complete[tract_complete$Region == "Northeast",]
write_sf(tract_complete_NE, "../3_data/TractValuesNE.shp")
tract_complete_MW <-
  tract_complete[tract_complete$Region == "Midwest",]
write_sf(tract_complete_MW, "../3_data/TractValuesMW.shp")
tract_complete_SO <-
  tract_complete[tract_complete$Region == "South",]
write_sf(tract_complete_SO, "../3_data/TractValuesSO.shp")
tract_complete_WT <-
  tract_complete[tract_complete$Region == "West",]
write_sf(tract_complete_WT, "../3_data/TractValuesWT.shp")

# ----------------------------------------------------------------------%
# comparing nature metrics ----------------------------------------------
# ----------------------------------------------------------------------%

# ----- desc stats -----

library(dplyr)
#nationwide
county_F %>% dplyr::select(PcPark, PcCanopy, NDVI_Ann, NDVI_Sum) %>%
  descr(out = "viewer",
        show = c("n", "md", "iqr", "range", "miss"))
tract_F %>% dplyr::select(PcPark, PcCanopy, NDVI_Ann, NDVI_Sum) %>%
  descr(out = "viewer", show = c("n", "md", "iqr", "range"))

#Northeast
county_NE %>% dplyr::select(PcPark, PcCanopy, NDVI_Ann, NDVI_Sum) %>%
  descr(out = "viewer", show = c("n", "md", "iqr", "range"))
tract_NE %>% dplyr::select(PcPark, PcCanopy, NDVI_Ann, NDVI_Sum) %>%
  descr(out = "viewer", show = c("n", "md", "iqr", "range"))

#Midwest
county_MW %>% dplyr::select(PcPark, PcCanopy, NDVI_Ann, NDVI_Sum) %>%
  descr(out = "viewer", show = c("n", "md", "iqr", "range"))
tract_MW %>% dplyr::select(PcPark, PcCanopy, NDVI_Ann, NDVI_Sum) %>%
  descr(out = "viewer", show = c("n", "md", "iqr", "range"))

#South
county_SO %>% dplyr::select(PcPark, PcCanopy, NDVI_Ann, NDVI_Sum) %>%
  descr(out = "viewer", show = c("n", "md", "iqr", "range"))
tract_SO %>% dplyr::select(PcPark, PcCanopy, NDVI_Ann, NDVI_Sum) %>%
  descr(out = "viewer", show = c("n", "md", "iqr", "range"))

#West
county_WT %>% dplyr::select(PcPark, PcCanopy, NDVI_Ann, NDVI_Sum) %>%
  descr(out = "viewer", show = c("n", "md", "iqr", "range"))
tract_WT %>% dplyr::select(PcPark, PcCanopy, NDVI_Ann, NDVI_Sum) %>%
  descr(out = "viewer", show = c("n", "md", "iqr", "range"))

# ----- Create ggplot histogram theme -----

hist_theme <- theme_bw() +
  theme(axis.title.x = element_blank()) +
  theme(axis.title.y = element_blank()) +
  ggeasy::easy_center_title() +
  theme(
    panel.border = element_blank(),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    axis.line = element_line(colour = "black")
  )

# ----- PcPark histograms -----

hist_PcPark_c <- ggplot(county_F, aes(PcPark)) +
  geom_histogram(color = NA, fill = "#3B6AA8") + ggtitle("Nationwide counties") +
  xlim(0, 1) + hist_theme
hist_PcPark_c_NE <- ggplot(county_NE, aes(PcPark)) +
  geom_histogram(color = NA, fill = "#32A229") + ggtitle("Northeastern counties") +
  xlim(0, 1) + hist_theme
hist_PcPark_c_MW <- ggplot(county_MW, aes(PcPark)) +
  geom_histogram(color = NA, fill = "#EDA247") + ggtitle("Midwestern counties") +
  xlim(0, 1) + hist_theme
hist_PcPark_c_SO <- ggplot(county_SO, aes(PcPark)) +
  geom_histogram(color = NA, fill = "#DB4325") + ggtitle("Southern counties") +
  xlim(0, 1) + hist_theme
hist_PcPark_c_WT <- ggplot(county_WT, aes(PcPark)) +
  geom_histogram(color = NA, fill = "#F8DE27") + ggtitle("Western counties") +
  xlim(0, 1) + hist_theme
hist_PcPark_t <- ggplot(tract_F, aes(PcPark)) +
  geom_histogram(color = NA, fill = "#3B6AA8") + ggtitle("Nationwide tracts") +
  xlim(0, 1) + hist_theme
hist_PcPark_t_NE <- ggplot(tract_NE, aes(PcPark)) +
  geom_histogram(color = NA, fill = "#32A229") + ggtitle("Northeastern tracts") +
  xlim(0, 1) + hist_theme
hist_PcPark_t_MW <- ggplot(tract_MW, aes(PcPark)) +
  geom_histogram(color = NA, fill = "#EDA247") + ggtitle("Midwestern tracts") +
  xlim(0, 1) + hist_theme
hist_PcPark_t_SO <- ggplot(tract_SO, aes(PcPark)) +
  geom_histogram(color = NA, fill = "#DB4325") + ggtitle("Southern tracts") +
  xlim(0, 1) + hist_theme
hist_PcPark_t_WT <- ggplot(tract_WT, aes(PcPark)) +
  geom_histogram(color = NA, fill = "#F8DE27") + ggtitle("Western tracts") +
  xlim(0, 1) + hist_theme

hist_PcPark_all <- grid.arrange(
  hist_PcPark_c,
  hist_PcPark_t,
  hist_PcPark_c_NE,
  hist_PcPark_t_NE,
  hist_PcPark_c_MW,
  hist_PcPark_t_MW,
  hist_PcPark_c_SO,
  hist_PcPark_t_SO,
  hist_PcPark_c_WT,
  hist_PcPark_t_WT,
  nrow = 5
)

ggsave(
  "../PcPark_Hist.png",
  plot = hist_PcPark_all,
  width = 8,
  height = 6,
  units = "in",
  dpi = 300
)

# ----- NDVI_Ann histograms -----

hist_NDVI_Ann_c <- ggplot(county_F, aes(NDVI_Ann)) +
  geom_histogram(color = NA, fill = "#3B6AA8") + ggtitle("Nationwide counties") +
  xlim(0, 1) + hist_theme
hist_NDVI_Ann_c_NE <- ggplot(county_NE, aes(NDVI_Ann)) +
  geom_histogram(color = NA, fill = "#32A229") + ggtitle("Northeastern counties") +
  xlim(0, 1) + hist_theme
hist_NDVI_Ann_c_MW <- ggplot(county_MW, aes(NDVI_Ann)) +
  geom_histogram(color = NA, fill = "#EDA247") + ggtitle("Midwestern counties") +
  xlim(0, 1) + hist_theme
hist_NDVI_Ann_c_SO <- ggplot(county_SO, aes(NDVI_Ann)) +
  geom_histogram(color = NA, fill = "#DB4325") + ggtitle("Southern counties") +
  xlim(0, 1) + hist_theme
hist_NDVI_Ann_c_WT <- ggplot(county_WT, aes(NDVI_Ann)) +
  geom_histogram(color = NA, fill = "#F8DE27") + ggtitle("Western counties") +
  xlim(0, 1) + hist_theme
hist_NDVI_Ann_t <- ggplot(tract_F, aes(NDVI_Ann)) +
  geom_histogram(color = NA, fill = "#3B6AA8") + ggtitle("Nationwide tracts") +
  xlim(0, 1) + hist_theme
hist_NDVI_Ann_t_NE <- ggplot(tract_NE, aes(NDVI_Ann)) +
  geom_histogram(color = NA, fill = "#32A229") + ggtitle("Northeastern tracts") +
  xlim(0, 1) + hist_theme
hist_NDVI_Ann_t_MW <- ggplot(tract_MW, aes(NDVI_Ann)) +
  geom_histogram(color = NA, fill = "#EDA247") + ggtitle("Midwestern tracts") +
  xlim(0, 1) + hist_theme
hist_NDVI_Ann_t_SO <- ggplot(tract_SO, aes(NDVI_Ann)) +
  geom_histogram(color = NA, fill = "#DB4325") + ggtitle("Southern tracts") +
  xlim(0, 1) + hist_theme
hist_NDVI_Ann_t_WT <- ggplot(tract_WT, aes(NDVI_Ann)) +
  geom_histogram(color = NA, fill = "#F8DE27") + ggtitle("Western tracts") +
  xlim(0, 1) + hist_theme

hist_NDVI_Ann_all <- grid.arrange(
  hist_NDVI_Ann_c,
  hist_NDVI_Ann_t,
  hist_NDVI_Ann_c_NE,
  hist_NDVI_Ann_t_NE,
  hist_NDVI_Ann_c_MW,
  hist_NDVI_Ann_t_MW,
  hist_NDVI_Ann_c_SO,
  hist_NDVI_Ann_t_SO,
  hist_NDVI_Ann_c_WT,
  hist_NDVI_Ann_t_WT,
  nrow = 5
)

ggsave(
  "../NDVI_Ann_Hist.png",
  plot = hist_NDVI_Ann_all,
  width = 8,
  height = 6,
  units = "in",
  dpi = 300
)

# ----- NDVI_Sum histograms -----

hist_NDVI_Sum_c <- ggplot(county_F, aes(NDVI_Sum)) +
  geom_histogram(color = NA, fill = "#3B6AA8") + ggtitle("Nationwide counties") +
  xlim(0, 1) + hist_theme
hist_NDVI_Sum_c_NE <- ggplot(county_NE, aes(NDVI_Sum)) +
  geom_histogram(color = NA, fill = "#32A229") + ggtitle("Northeastern counties") +
  xlim(0, 1) + hist_theme
hist_NDVI_Sum_c_MW <- ggplot(county_MW, aes(NDVI_Sum)) +
  geom_histogram(color = NA, fill = "#EDA247") + ggtitle("Midwestern counties") +
  xlim(0, 1) + hist_theme
hist_NDVI_Sum_c_SO <- ggplot(county_SO, aes(NDVI_Sum)) +
  geom_histogram(color = NA, fill = "#DB4325") + ggtitle("Southern counties") +
  xlim(0, 1) + hist_theme
hist_NDVI_Sum_c_WT <- ggplot(county_WT, aes(NDVI_Sum)) +
  geom_histogram(color = NA, fill = "#F8DE27") + ggtitle("Western counties") +
  xlim(0, 1) + hist_theme
hist_NDVI_Sum_t <- ggplot(tract_F, aes(NDVI_Sum)) +
  geom_histogram(color = NA, fill = "#3B6AA8") + ggtitle("Nationwide tracts") +
  xlim(0, 1) + hist_theme
hist_NDVI_Sum_t_NE <- ggplot(tract_NE, aes(NDVI_Sum)) +
  geom_histogram(color = NA, fill = "#32A229") + ggtitle("Northeastern tracts") +
  xlim(0, 1) + hist_theme
hist_NDVI_Sum_t_MW <- ggplot(tract_MW, aes(NDVI_Sum)) +
  geom_histogram(color = NA, fill = "#EDA247") + ggtitle("Midwestern tracts") +
  xlim(0, 1) + hist_theme
hist_NDVI_Sum_t_SO <- ggplot(tract_SO, aes(NDVI_Sum)) +
  geom_histogram(color = NA, fill = "#DB4325") + ggtitle("Southern tracts") +
  xlim(0, 1) + hist_theme
hist_NDVI_Sum_t_WT <- ggplot(tract_WT, aes(NDVI_Sum)) +
  geom_histogram(color = NA, fill = "#F8DE27") + ggtitle("Western tracts") +
  xlim(0, 1) + hist_theme

hist_NDVI_Sum_all <- grid.arrange(
  hist_NDVI_Sum_c,
  hist_NDVI_Sum_t,
  hist_NDVI_Sum_c_NE,
  hist_NDVI_Sum_t_NE,
  hist_NDVI_Sum_c_MW,
  hist_NDVI_Sum_t_MW,
  hist_NDVI_Sum_c_SO,
  hist_NDVI_Sum_t_SO,
  hist_NDVI_Sum_c_WT,
  hist_NDVI_Sum_t_WT,
  nrow = 5
)

ggsave(
  "../NDVI_Sum_Hist.png",
  plot = hist_NDVI_Sum_all,
  width = 8,
  height = 6,
  units = "in",
  dpi = 300
)

# ----- PcCanopy histograms -----

hist_PcCanopy_c <- ggplot(county_F, aes(PcCanopy)) +
  geom_histogram(color = NA, fill = "#3B6AA8") + ggtitle("Nationwide counties") +
  xlim(0, 1) + hist_theme
hist_PcCanopy_c_NE <- ggplot(county_NE, aes(PcCanopy)) +
  geom_histogram(color = NA, fill = "#32A229") + ggtitle("Northeastern counties") +
  xlim(0, 1) + hist_theme
hist_PcCanopy_c_MW <- ggplot(county_MW, aes(PcCanopy)) +
  geom_histogram(color = NA, fill = "#EDA247") + ggtitle("Midwestern counties") +
  xlim(0, 1) + hist_theme
hist_PcCanopy_c_SO <- ggplot(county_SO, aes(PcCanopy)) +
  geom_histogram(color = NA, fill = "#DB4325") + ggtitle("Southern counties") +
  xlim(0, 1) + hist_theme
hist_PcCanopy_c_WT <- ggplot(county_WT, aes(PcCanopy)) +
  geom_histogram(color = NA, fill = "#F8DE27") + ggtitle("Western counties") +
  xlim(0, 1) + hist_theme
hist_PcCanopy_t <- ggplot(tract_F, aes(PcCanopy)) +
  geom_histogram(color = NA, fill = "#3B6AA8") + ggtitle("Nationwide tracts") +
  xlim(0, 1) + hist_theme
hist_PcCanopy_t_NE <- ggplot(tract_NE, aes(PcCanopy)) +
  geom_histogram(color = NA, fill = "#32A229") + ggtitle("Northeastern tracts") +
  xlim(0, 1) + hist_theme
hist_PcCanopy_t_MW <- ggplot(tract_MW, aes(PcCanopy)) +
  geom_histogram(color = NA, fill = "#EDA247") + ggtitle("Midwestern tracts") +
  xlim(0, 1) + hist_theme
hist_PcCanopy_t_SO <- ggplot(tract_SO, aes(PcCanopy)) +
  geom_histogram(color = NA, fill = "#DB4325") + ggtitle("Southern tracts") +
  xlim(0, 1) + hist_theme
hist_PcCanopy_t_WT <- ggplot(tract_WT, aes(PcCanopy)) +
  geom_histogram(color = NA, fill = "#F8DE27") + ggtitle("Western tracts") +
  xlim(0, 1) + hist_theme

hist_PcCanopy_all <- grid.arrange(
  hist_PcCanopy_c,
  hist_PcCanopy_t,
  hist_PcCanopy_c_NE,
  hist_PcCanopy_t_NE,
  hist_PcCanopy_c_MW,
  hist_PcCanopy_t_MW,
  hist_PcCanopy_c_SO,
  hist_PcCanopy_t_SO,
  hist_PcCanopy_c_WT,
  hist_PcCanopy_t_WT,
  nrow = 5
)

ggsave(
  "../PcCanopy_Hist.png",
  plot = hist_PcCanopy_all,
  width = 8,
  height = 6,
  units = "in",
  dpi = 300
)

# ----- correlations -----

county_Fn <-
  county_F %>% dplyr::select(PcPark, NDVI_Ann, NDVI_Sum, PcCanopy)
county_NEn <-
  county_NE %>% dplyr::select(PcPark, NDVI_Ann, NDVI_Sum, PcCanopy)
county_MWn <-
  county_MW %>% dplyr::select(PcPark, NDVI_Ann, NDVI_Sum, PcCanopy)
county_SOn <-
  county_SO %>% dplyr::select(PcPark, NDVI_Ann, NDVI_Sum, PcCanopy)
county_WTn <-
  county_WT %>% dplyr::select(PcPark, NDVI_Ann, NDVI_Sum, PcCanopy)
tract_Fn <-
  tract_F %>% dplyr::select(PcPark, NDVI_Ann, NDVI_Sum, PcCanopy)
tract_NEn <-
  tract_NE %>% dplyr::select(PcPark, NDVI_Ann, NDVI_Sum, PcCanopy)
tract_MWn <-
  tract_MW %>% dplyr::select(PcPark, NDVI_Ann, NDVI_Sum, PcCanopy)
tract_SOn <-
  tract_SO %>% dplyr::select(PcPark, NDVI_Ann, NDVI_Sum, PcCanopy)
tract_WTn <-
  tract_WT %>% dplyr::select(PcPark, NDVI_Ann, NDVI_Sum, PcCanopy)

correlation::correlation(county_Fn, decimals = 2)
correlation::correlation(county_NEn, decimals = 2)
correlation::correlation(county_MWn, decimals = 2)
correlation::correlation(county_SOn, decimals = 2)
correlation::correlation(county_WTn, decimals = 2)
correlation::correlation(tract_Fn, decimals = 2)
correlation::correlation(tract_NEn, decimals = 2)
correlation::correlation(tract_MWn, decimals = 2)
correlation::correlation(tract_SOn, decimals = 2)
correlation::correlation(tract_WTn, decimals = 2)

# ----------------------------------------------------------------------%
# sociodemographic desc stats ------------------------------------------
# ----------------------------------------------------------------------%

# ----- nationwide -----

county_F %>% dplyr::select(
  PopDens,
  MedHInc,
  MedHVal,
  PcPov,
  Gini,
  PcHighSc,
  PcColl,
  PcUnemp,
  PcEmpNR,
  PcBlack,
  PcAsian,
  PcHisp,
  Pc65Up,
  PcFem,
  TotPop
) %>%
  descr(out = "viewer", show = c("n", "md", "iqr", "range"))
tract_F %>% dplyr::select(
  PopDens,
  MedHInc,
  MedHVal,
  PcPov,
  Gini,
  PcHighSc,
  PcColl,
  PcUnemp,
  PcEmpNR,
  PcBlack,
  PcAsian,
  PcHisp,
  Pc65Up,
  PcFem,
  TotPop
) %>%
  descr(out = "viewer", show = c("n", "md", "iqr", "range"))

# ----- Northeast -----
county_NE %>% dplyr::select(
  PopDens,
  MedHInc,
  MedHVal,
  PcPov,
  Gini,
  PcHighSc,
  PcColl,
  PcUnemp,
  PcEmpNR,
  PcBlack,
  PcAsian,
  PcHisp,
  Pc65Up,
  PcFem,
  TotPop
) %>%
  descr(out = "viewer", show = c("n", "md", "iqr", "range"))
tract_NE %>% dplyr::select(
  PopDens,
  MedHInc,
  MedHVal,
  PcPov,
  Gini,
  PcHighSc,
  PcColl,
  PcUnemp,
  PcEmpNR,
  PcBlack,
  PcAsian,
  PcHisp,
  Pc65Up,
  PcFem,
  TotPop
) %>%
  descr(out = "viewer", show = c("n", "md", "iqr", "range"))

# ----- Midwest -----
county_MW %>% dplyr::select(
  PopDens,
  MedHInc,
  MedHVal,
  PcPov,
  Gini,
  PcHighSc,
  PcColl,
  PcUnemp,
  PcEmpNR,
  PcBlack,
  PcAsian,
  PcHisp,
  Pc65Up,
  PcFem,
  TotPop
) %>%
  descr(out = "viewer", show = c("n", "md", "iqr", "range"))
tract_MW %>% dplyr::select(
  PopDens,
  MedHInc,
  MedHVal,
  PcPov,
  Gini,
  PcHighSc,
  PcColl,
  PcUnemp,
  PcEmpNR,
  PcBlack,
  PcAsian,
  PcHisp,
  Pc65Up,
  PcFem,
  TotPop
) %>%
  descr(out = "viewer", show = c("n", "md", "iqr", "range"))

# ----- South -----
county_SO %>% dplyr::select(
  PopDens,
  MedHInc,
  MedHVal,
  PcPov,
  Gini,
  PcHighSc,
  PcColl,
  PcUnemp,
  PcEmpNR,
  PcBlack,
  PcAsian,
  PcHisp,
  Pc65Up,
  PcFem,
  TotPop
) %>%
  descr(out = "viewer", show = c("n", "md", "iqr", "range"))
tract_SO %>% dplyr::select(
  PopDens,
  MedHInc,
  MedHVal,
  PcPov,
  Gini,
  PcHighSc,
  PcColl,
  PcUnemp,
  PcEmpNR,
  PcBlack,
  PcAsian,
  PcHisp,
  Pc65Up,
  PcFem,
  TotPop
) %>%
  descr(out = "viewer", show = c("n", "md", "iqr", "range"))

# ----- West -----
county_WT %>% dplyr::select(
  PopDens,
  MedHInc,
  MedHVal,
  PcPov,
  Gini,
  PcHighSc,
  PcColl,
  PcUnemp,
  PcEmpNR,
  PcBlack,
  PcAsian,
  PcHisp,
  Pc65Up,
  PcFem,
  TotPop
) %>%
  descr(out = "viewer", show = c("n", "md", "iqr", "range"))
tract_WT %>% dplyr::select(
  PopDens,
  MedHInc,
  MedHVal,
  PcPov,
  Gini,
  PcHighSc,
  PcColl,
  PcUnemp,
  PcEmpNR,
  PcBlack,
  PcAsian,
  PcHisp,
  Pc65Up,
  PcFem,
  TotPop
) %>%
  descr(out = "viewer", show = c("n", "md", "iqr", "range"))

# ------------------------------------------------------%
# regressions ------------------------------------------
# ------------------------------------------------------%

# specify vif code
vif.lme <- function (fit) {
  v <- vcov(fit)
  nam <- names(fixef(fit))
  ns <- sum(1 *
              (nam == "Intercept" | nam == "(Intercept)"))
  if (ns > 0) {
    v <- v[-(1:ns), -(1:ns), drop = FALSE]
    nam <- nam[-(1:ns)]
  }
  d <- diag(v) ^ 0.5
  v <- diag(solve(v / (d %o% d)))
  names(v) <- nam
  v
}

# ----- main model -----
glm <-
  PcPark ~ PopDens + MedHVal + PcPov + Gini + PcHighSc + PcColl + PcUnemp + PcEmpNR +
  PcBlack + PcAsian + PcHisp + Pc65Up + PcFem + TotPop + (1 | State)

glm_Fc <- lme4::lmer(glm, data = county_F, REML = F)
glm_Ft <- lme4::lmer(glm, data = tract_F, REML = F)
glm_NEc <- lme4::lmer(glm, data = county_NE, REML = F)
glm_NEt <- lme4::lmer(glm, data = tract_NE, REML = F)
glm_MWc <- lme4::lmer(glm, data = county_MW, REML = F)
glm_MWt <- lme4::lmer(glm, data = tract_MW, REML = F)
glm_SOc <- lme4::lmer(glm, data = county_SO, REML = F)
glm_SOt <- lme4::lmer(glm, data = tract_SO, REML = F)
glm_WTc <- lme4::lmer(glm, data = county_WT, REML = F)
glm_WTt <- lme4::lmer(glm, data = tract_WT, REML = F)

# residuals
par(mfrow = c(2, 2))
plot(glm_Fc)
plot(glm_Ft)

# VIF
vif.lme(glm_Fc)
vif.lme(glm_Ft)

# report results
sjPlot::tab_model(
  glm_Fc,
  glm_Ft,
  glm_NEc,
  glm_NEt,
  glm_MWc,
  glm_MWt,
  glm_SOc,
  glm_SOt,
  glm_WTc,
  glm_WTt,
  dv.labels = c("Counties", "Tracts"),
  show.intercept = F,
  show.est = F,
  show.se = F,
  show.ci = F,
  show.std	= T,
  show.p = T,
  show.r2 = T,
  show.re.var = T,
  show.fstat = T,
  show.aic = T
)

# ----- alternative model -----
glm2 <-
  PcPark ~ PopDens + MedHInc + PcBlack + PcAsian + PcHisp + Pc65Up + PcFem + TotPop +
  (1 | State)

glm2_Fc <- lme4::lmer(glm2, data = county_F, REML = F)
glm2_Ft <- lme4::lmer(glm2, data = tract_F, REML = F)
glm2_NEc <- lme4::lmer(glm2, data = county_NE, REML = F)
glm2_NEt <- lme4::lmer(glm2, data = tract_NE, REML = F)
glm2_MWc <- lme4::lmer(glm2, data = county_MW, REML = F)
glm2_MWt <- lme4::lmer(glm2, data = tract_MW, REML = F)
glm2_SOc <- lme4::lmer(glm2, data = county_SO, REML = F)
glm2_SOt <- lme4::lmer(glm2, data = tract_SO, REML = F)
glm2_WTc <- lme4::lmer(glm2, data = county_WT, REML = F)
glm2_WTt <- lme4::lmer(glm2, data = tract_WT, REML = F)

# residuals
par(mfrow = c(2, 2))
plot(glm2_Fc)
plot(glm2_Ft)

# VIF
vif.lme(glm2_Fc)
vif.lme(glm2_Ft)

# report results
sjPlot::tab_model(
  glm2_Fc,
  glm2_Ft,
  glm2_NEc,
  glm2_NEt,
  glm2_MWc,
  glm2_MWt,
  glm2_SOc,
  glm2_SOt,
  glm2_WTc,
  glm2_WTt,
  dv.labels = c("Counties", "Tracts"),
  show.intercept = F,
  show.est = F,
  show.se = F,
  show.ci = F,
  show.std	= T,
  show.p = T,
  show.r2 = T,
  show.re.var = T,
  show.fstat = T,
  show.aic = T
)

# ----- urban model -----

# Testing population density cut-points
nrow(county_F[(county_F$PopDens >= 1000),]) # 45 TOO small for nationwide analyses

nrow(county_NE[(county_NE$PopDens >= 300),]) # 43 TOO small for Northeastern analyses
nrow(county_MW[(county_MW$PopDens >= 300),]) # 30 TOO small for Midwestern analyses
nrow(county_SO[(county_SO$PopDens >= 300),]) # 93 small for Southern analyses
nrow(county_WT[(county_WT$PopDens >= 300),]) # 16 TOO small for Western analyses

nrow(county_NE[(county_NE$PopDens >= 50),]) # 121 Northeastern counties
nrow(county_MW[(county_MW$PopDens >= 50),]) # 178 Midwestern counties
nrow(county_SO[(county_SO$PopDens >= 50),]) # 386 Southern counties
nrow(county_WT[(county_WT$PopDens >= 50),]) # 58 Western counties

# Limit counties and tracts
county_Fu <- county_F[(county_F$PopDens >= 50),]
county_NEu <- county_NE[(county_NE$PopDens >= 50),]
county_MWu <- county_MW[(county_MW$PopDens >= 50),]
county_SOu <- county_SO[(county_SO$PopDens >= 50),]
county_WTu <- county_WT[(county_WT$PopDens >= 50),]
tract_Fu <- tract_F[(tract_F$PopDens >= 1000),]
tract_NEu <- tract_NE[(tract_NE$PopDens >= 1000),]
tract_MWu <- tract_MW[(tract_MW$PopDens >= 1000),]
tract_SOu <- tract_SO[(tract_SO$PopDens >= 1000),]
tract_WTu <- tract_WT[(tract_WT$PopDens >= 1000),]

glm_Fcu <- lme4::lmer(glm, data = county_Fu, REML = F)
glm_Ftu <- lme4::lmer(glm, data = tract_Fu, REML = F)
glm_NEcu <- lme4::lmer(glm, data = county_NEu, REML = F)
glm_NEtu <- lme4::lmer(glm, data = tract_NEu, REML = F)
glm_MWcu <- lm(
  PcPark ~ PopDens + MedHVal + PcPov + Gini + PcHighSc + PcColl + PcUnemp + PcEmpNR +
    PcBlack + PcAsian + PcHisp + Pc65Up + PcFem + TotPop,
  data = county_MWu
) # small Ns in several Midwestern states required OLS rather than mixed models
glm_MWtu <- lme4::lmer(glm, data = tract_MWu, REML = F)
glm_SOcu <- lme4::lmer(glm, data = county_SOu, REML = F)
glm_SOtu <- lme4::lmer(glm, data = tract_SOu, REML = F)
glm_WTcu <- lme4::lmer(glm, data = county_WTu, REML = F)
glm_WTtu <- lme4::lmer(glm, data = tract_WTu, REML = F)

# report results
sjPlot::tab_model(
  glm_Fcu,
  glm_Ftu,
  glm_NEcu,
  glm_NEtu,
  glm_MWcu,
  glm_MWtu,
  glm_SOcu,
  glm_SOtu,
  glm_WTcu,
  glm_WTtu,
  dv.labels = c("Counties", "Tracts"),
  show.intercept = F,
  show.est = F,
  show.se = F,
  show.ci = F,
  show.std	= T,
  show.p = T,
  show.r2 = T,
  show.re.var = T,
  show.fstat = T,
  show.aic = T
)

# ----- Create ggplot theme for regression models -----
glm_theme <- theme_bw() +
  theme(axis.title.x = element_blank()) +
  theme(axis.title.y = element_blank()) +
  theme(
    axis.text = element_text(size = 15, face = "bold"),
    legend.text = element_text(size = 15),
    legend.title = element_text(size = 18, face = "bold")
  )
theme(
  panel.border = element_blank(),
  panel.grid.major = element_blank(),
  panel.grid.minor = element_blank(),
  axis.line = element_line(colour = "black")
)

# ----- model plots for counties -----

glm_plots_counties <- plot_models(
  glm_WTc,
  glm_SOc,
  glm_MWc,
  glm_NEc,
  glm_Fc,
  std.est = TRUE,
  p.shape = TRUE,
  axis.labels = c(
    "Total population",
    "% female",
    "% 65+ years",
    "% Hispanic",
    "%NH Asian",
    "% NH Black",
    "% employed natural resources",
    "% unemployed",
    "% college degree",
    "% high school degree",
    "Gini index",
    "% poverty",
    "Median home value",
    "Population density"
  ),
  legend.title = "Regions",
  m.labels = c("West",
               "South",
               "Midwest",
               "Northeast",
               "Nationwide"),
  spacing = .6,
  dot.size = 3,
  line.size = .8,
  colors = c("#3B6AA8",
             "#32A229",
             "#EDA247",
             "#DB4325",
             "#F8DE27"),
  show.legend = T,
  grid = F
) +
  glm_theme +
  geom_hline(
    yintercept = 0,
    size = 1,
    linetype = "dashed",
    color = "black"
  ) +
  ylim(-.8, .8)

ggsave(
  "../glm_plots_counties.png",
  plot = glm_plots_counties,
  width = 8,
  height = 12,
  units = "in",
  dpi = 300
)

# ----- model plots for tracts -----

glm_plots_tracts <- plot_models(
  glm_WTt,
  glm_SOt,
  glm_MWt,
  glm_NEt,
  glm_Ft,
  std.est = TRUE,
  p.shape = TRUE,
  axis.labels = c(
    "Total population",
    "% female",
    "% 65+ years",
    "% Hispanic",
    "%NH Asian",
    "% NH Black",
    "% employed natural resources",
    "% unemployed",
    "% college degree",
    "% high school degree",
    "Gini index",
    "% poverty",
    "Median home value",
    "Population density"
  ),
  legend.title = "Regions",
  m.labels = c("West",
               "South",
               "Midwest",
               "Northeast",
               "Nationwide"),
  spacing = .6,
  dot.size = 3,
  line.size = .8,
  colors = c("#3B6AA8",
             "#32A229",
             "#EDA247",
             "#DB4325",
             "#F8DE27"),
  show.legend = T,
  grid = F
) +
  glm_theme +
  geom_hline(
    yintercept = 0,
    size = 1,
    linetype = "dashed",
    color = "black"
  ) +
  ylim(-.8, .8)

ggsave(
  "../glm_plots_tracts.png",
  plot = glm_plots_tracts,
  width = 8,
  height = 12,
  units = "in",
  dpi = 300
)

# ----- model plots for counties with income -----

glm2_plots_counties <- plot_models(
  glm2_WTc,
  glm2_SOc,
  glm2_MWc,
  glm2_NEc,
  glm2_Fc,
  std.est = TRUE,
  p.shape = TRUE,
  axis.labels = c(
    "Total population",
    "% female",
    "% 65+ years",
    "% Hispanic",
    "%NH Asian",
    "% NH Black",
    "Median household income",
    "Population density"
  ),
  legend.title = "Regions",
  m.labels = c("West",
               "South",
               "Midwest",
               "Northeast",
               "Nationwide"),
  spacing = .6,
  dot.size = 3,
  line.size = .8,
  colors = c("#3B6AA8",
             "#32A229",
             "#EDA247",
             "#DB4325",
             "#F8DE27"),
  show.legend = T,
  grid = F
) +
  glm_theme +
  geom_hline(
    yintercept = 0,
    size = 1,
    linetype = "dashed",
    color = "black"
  ) +
  ylim(-.8, .8)

ggsave(
  "../glm2_plots_counties_income.png",
  plot = glm2_plots_counties,
  width = 8,
  height = 10,
  units = "in",
  dpi = 300
)

# ----- model plots for tracts with income -----

glm2_plots_tracts <- plot_models(
  glm2_WTt,
  glm2_SOt,
  glm2_MWt,
  glm2_NEt,
  glm2_Ft,
  std.est = TRUE,
  p.shape = TRUE,
  axis.labels = c(
    "Total population",
    "% female",
    "% 65+ years",
    "% Hispanic",
    "%NH Asian",
    "% NH Black",
    "Median household income",
    "Population density"
  ),
  legend.title = "Regions",
  m.labels = c("West",
               "South",
               "Midwest",
               "Northeast",
               "Nationwide"),
  spacing = .6,
  dot.size = 3,
  line.size = .8,
  colors = c("#3B6AA8",
             "#32A229",
             "#EDA247",
             "#DB4325",
             "#F8DE27"),
  show.legend = T,
  grid = F
) +
  glm_theme +
  geom_hline(
    yintercept = 0,
    size = 1,
    linetype = "dashed",
    color = "black"
  ) +
  ylim(-.8, .8)

ggsave(
  "../glm2_plots_tracts_income.png",
  plot = glm2_plots_tracts,
  width = 8,
  height = 10,
  units = "in",
  dpi = 300
)

# ------------------------------------------------------------------------%
# Complete ----------------------------------------------------------------
# ------------------------------------------------------------------------%

beep(3) # fun sound for completion
kmcd39/divM documentation built on Oct. 21, 2023, 11:28 p.m.